In [1]:
# Imports and notebook configuration.
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
# NOTE(review): this `visualize` import is shadowed a few lines down by
# `import visualize_plotly as visualize`; only the plotly-based module is used.
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

# Shadows the earlier `import visualize` -- all plotting below is plotly-based.
import visualize_plotly as visualize
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)

from IPython.display import Image

# Auto-reload local modules (utils, cs_detection, visualize_plotly, ...) on edit.
%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using the PVLib clearsky model and a statistical model. The NSRDB model won't be available for ground measurements.

In [2]:
# Load NSRDB data for ORNL and localize timestamps to EST.
nsrdb = cs_detection.ClearskyDetection.read_pickle('ornl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('EST')
# Adds a 'tfn' column -- presumably time-from-solar-noon derived from the
# clearsky profile; TODO confirm against cs_detection.
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [3]:
# Number of NSRDB samples available.
len(nsrdb.df)
Out[3]:
315552

Train/test on NSRDB data to find optimal parameters

Default classifier

In [4]:
# Initial train/test split of NSRDB data (test = 2015 onward).
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates('01-01-2010', None)
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [5]:
# NOTE(review): train starts at 2010 here but is rebuilt later (In [11])
# without that restriction; this section appears exploratory.
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [6]:
# Baseline classifier with default hyperparameters; fixed seed for reproducibility.
clf = ensemble.RandomForestClassifier(random_state=42)
In [7]:
# Rolling-window (window=3) features comparing measured GHI to the pvlib clearsky model.
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [8]:
# Feature columns fed to the classifier: time-from-noon plus window statistics
# of the measured/clearsky ratio, difference, gradients, and line-length ratio.
feature_cols = [
    'tfn',
    'abs_ideal_ratio_diff grad',
    'abs_ideal_ratio_diff grad mean',
    'abs_ideal_ratio_diff grad std',
    'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib ratio',
    'GHI Clearsky GHI pvlib ratio mean',
    'GHI Clearsky GHI pvlib ratio std',
    'GHI Clearsky GHI pvlib diff',
    'GHI Clearsky GHI pvlib diff mean',
    'GHI Clearsky GHI pvlib diff std'
]

target_cols = ['sky_status']

# --- Exploratory pairplot + clustering (originally several cells fused onto
# --- one unreadable, syntactically invalid line; reconstructed below).
import seaborn as sns

pplot = sns.pairplot(train.df[feature_cols + target_cols], hue='sky_status')

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

X = train.df[feature_cols + ['GHI', 'Clearsky GHI pvlib']].values
X_std = StandardScaler().fit_transform(X)
kmeans = DBSCAN().fit(X_std)
# kmeans = KMeans(n_clusters=4, random_state=0).fit(X_std)
train.df['sky_status_km'] = 0
train.df['sky_status_km'] = kmeans.labels_
pd.unique(train.df['sky_status_km'])
# Flag cluster 0 as "clear".  The original expression tested `== 0` twice and
# OR-ed the results (`(x == 0) | (x == 0)`); reduced to one test -- identical result.
train.df['sky_status_km'] = train.df['sky_status_km'] == 0
In [9]:
# Overlay measured GHI, clearsky GHI, and NSRDB-labeled clear points.
x = train.df.index
trace1 = go.Scatter(x=x, y=train.df['GHI'])
trace2 = go.Scatter(x=x, y=train.df['Clearsky GHI pvlib'])
# Markers at timestamps NSRDB flags as clear ('NSRBD' is a typo in the legend
# label; left as-is because it is a runtime string).
trace3 = go.Scatter(x=train.df[train.df['sky_status']].index, 
                    y=train.df[train.df['sky_status']]['GHI'], name='NSRBD', mode='markers')
# Commented-out variants compared NSRDB labels against the k-means/DBSCAN labels:
# trace3 = go.Scatter(x=train.df[train.df['sky_status'] & ~train.df['sky_status_km']].index, 
#                     y=train.df[train.df['sky_status'] & ~train.df['sky_status_km']]['GHI'], name='NSRBD', mode='markers')
# trace4 = go.Scatter(x=train.df[~train.df['sky_status'] & train.df['sky_status_km']].index, 
#                     y=train.df[~train.df['sky_status'] & train.df['sky_status_km']]['GHI'], name='km', mode='markers')
# trace5 = go.Scatter(x=train.df[train.df['sky_status'] & train.df['sky_status_km']].index, 
#                     y=train.df[train.df['sky_status'] & train.df['sky_status_km']]['GHI'], name='both', mode='markers')
iplot([trace1, trace2, trace3])# , trace4, trace5])
In [10]:
# Two stacked panels sharing an x-axis: the GHI time series with NSRDB clear
# markers on top, and the fraction of clear samples per year below.
fig = tools.make_subplots(rows=2, cols=1, shared_xaxes=True)

x = train.df.index
clear_pts = train.df[train.df['sky_status']]
trace1 = go.Scatter(x=x, y=train.df['GHI'], name='GHI')
trace2 = go.Scatter(x=x, y=train.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=clear_pts.index, y=clear_pts['GHI'], name='NSRBD', mode='markers')
for ts_trace in (trace1, trace2, trace3):
    fig.append_trace(ts_trace, 1, 1)

# One bar per calendar year, positioned at that year's middle timestamp.
yearly_groups = list(train.df.groupby(train.df.index.year))
names = [grp.index.tolist()[len(grp.index) // 2] for _, grp in yearly_groups]
pcts = [np.sum(grp['sky_status']) / len(grp) for _, grp in yearly_groups]

trace4 = go.Bar(x=names, y=pcts, name='Pct clear')
fig.append_trace(trace4, 2, 1)

iplot(fig)
This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x1,y2 ]

vis = visualize.Visualizer() vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
In [11]:
# Rebuild the split: train on everything before 2015, test on 2015 onward.
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [12]:
# Recompute window features on the new training span, then fit the default RF.
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[12]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
In [13]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

In [14]:
metrics.accuracy_score(test.df['sky_status'], pred)
Out[14]:
0.96084474885844751
In [15]:
# Visual comparison of ML predictions vs NSRDB labels on the test time series.
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# Disagreements (ML-only, NSRDB-only) and agreements between the two labelings.
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [16]:
# Confusion matrix: rows = NSRDB truth, columns = prediction.
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [17]:
# Feature importances of the fitted random forest.
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Gridsearch

In [18]:
import warnings

# Hyperparameter grid search for the random forest (originally an entire loop
# fused onto one syntactically invalid line; reconstructed below).
# Each combination: train on 1999-2014, validate on 2014, record four metrics.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')  # silence the per-day scaling RuntimeWarnings

    params = {}
    params['max_depth'] = [4, 8, 12, 16]
    params['n_estimators'] = [64]
    params['class_weight'] = [None, 'balanced']
    params['min_samples_leaf'] = [1, 2, 3]

    results = []
    for depth, nest, cw, min_samples in itertools.product(params['max_depth'],
                                                          params['n_estimators'],
                                                          params['class_weight'],
                                                          params['min_samples_leaf']):
        print('Params:')
        print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples))
        train2 = cs_detection.ClearskyDetection(train.df)
        train2.trim_dates('01-01-1999', '01-01-2014')
        utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
        test2 = cs_detection.ClearskyDetection(train.df)
        test2.trim_dates('01-01-2014', '01-01-2015')
        clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw,
                                              min_samples_leaf=min_samples, n_jobs=-1, random_state=42)
        # Fit only on daylight samples (GHI > 0).
        clf.fit(train2.df[train2.df['GHI'] > 0][feature_cols].values,
                train2.df[train2.df['GHI'] > 0][target_cols].values.flatten())
        print('\t Scores:')
        test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True)
        accuracy_score = metrics.accuracy_score(test2.df['sky_status'], test_pred)
        print('\t\t accuracy: {}'.format(accuracy_score))
        f1_score = metrics.f1_score(test2.df['sky_status'], test_pred)
        print('\t\t f1:{}'.format(f1_score))
        recall_score = metrics.recall_score(test2.df['sky_status'], test_pred)
        print('\t\t recall:{}'.format(recall_score))
        precision_score = metrics.precision_score(test2.df['sky_status'], test_pred)
        print('\t\t precision:{}'.format(precision_score))
        results.append({'max_depth': depth, 'n_estimators': nest, 'class_weight': cw,
                        'min_samples_leaf': min_samples, 'accuracy': accuracy_score,
                        'f1': f1_score, 'recall': recall_score, 'precision': precision_score})

# Persist results so later cells can reload instead of re-running the search.
runs_df = pd.DataFrame(results)
runs_df.to_csv('8_ornl_directional_features.csv')
In [19]:
# Reload grid-search results from disk (avoids re-running the search above).
runs_df = pd.read_csv('8_ornl_directional_features.csv')
In [20]:
# Distribution of each score across all hyperparameter combinations.
runs_df[['accuracy', 'f1', 'recall', 'precision']].iplot(kind='box')
In [21]:
runs_df
Out[21]:
Unnamed: 0 accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
0 0 0.405479 NaN 0.261067 4 1 64 0.150659 0.977164
1 1 0.405479 NaN 0.261067 4 2 64 0.150659 0.977164
2 2 0.404395 NaN 0.260716 4 3 64 0.150425 0.977164
3 3 0.386929 balanced 0.257449 4 1 64 0.147989 0.988848
4 4 0.386929 balanced 0.257449 4 2 64 0.147989 0.988848
5 5 0.386929 balanced 0.257449 4 3 64 0.147989 0.988848
6 6 0.871518 NaN 0.621617 8 1 64 0.454747 0.981944
7 7 0.870205 NaN 0.618712 8 2 64 0.452095 0.979819
8 8 0.872546 NaN 0.622740 8 3 64 0.456640 0.978757
9 9 0.836929 balanced 0.565739 8 1 64 0.396295 0.988317
10 10 0.841381 balanced 0.572790 8 2 64 0.403072 0.989379
11 11 0.861986 balanced 0.606317 8 3 64 0.437192 0.988848
12 12 0.876655 NaN 0.630535 12 1 64 0.464952 0.979288
13 13 0.876427 NaN 0.629851 12 2 64 0.464448 0.978226
14 14 0.876027 NaN 0.629478 12 3 64 0.463684 0.979819
15 15 0.875856 balanced 0.630039 12 1 64 0.463463 0.983537
16 16 0.875285 balanced 0.628717 12 2 64 0.462269 0.982475
17 17 0.874943 balanced 0.629084 12 3 64 0.461730 0.986723
18 18 0.876484 NaN 0.627795 16 1 64 0.464258 0.969198
19 19 0.879110 NaN 0.633691 16 2 64 0.469864 0.972916
20 20 0.878767 NaN 0.632780 16 3 64 0.469110 0.971853
21 21 0.878938 balanced 0.633235 16 1 64 0.469487 0.972384
22 22 0.877169 balanced 0.631381 16 2 64 0.465992 0.978757
23 23 0.877511 balanced 0.632030 16 3 64 0.466700 0.978757

Best recall model

In [22]:
# Rebuild train/test (train < 2015, test >= 2015) and rescale the clearsky model.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [23]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [24]:
# Grid-search row with the highest recall (idxmax works with iloc here because
# runs_df came from read_csv and has the default RangeIndex).
best_recall = runs_df.iloc[runs_df['recall'].idxmax()]
In [25]:
params_recall = best_recall[['max_depth', 'n_estimators', 'min_samples_leaf']].to_dict()
In [26]:
params_recall
Out[26]:
{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 64}

Best accuracy model

In [27]:
# Same train/test rebuild as the recall section (repeated per metric; a shared
# helper would remove the duplication).
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [28]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [29]:
# Grid-search row with the highest accuracy.
best_accuracy = runs_df.iloc[runs_df['accuracy'].idxmax()]
In [30]:
# Best-accuracy and best-recall rows are different models.
print(best_accuracy.equals(best_recall))
False
In [31]:
params_accuracy = best_accuracy[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [32]:
params_accuracy
Out[32]:
{'class_weight': nan,
 'max_depth': 16,
 'min_samples_leaf': 2,
 'n_estimators': 64}
In [33]:
params_accuracy['class_weight'] = None

Best precision model

In [34]:
# Same train/test rebuild again (repeated per metric).
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [35]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [36]:
# Grid-search row with the highest precision.
best_precision = runs_df.iloc[runs_df['precision'].idxmax()]
In [37]:
# Best precision coincides with best accuracy, not best recall.
print(best_precision.equals(best_recall))
print(best_precision.equals(best_accuracy))
False
True
In [38]:
params_precision = best_precision[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()

Same model as accuracy.

Best f1 model

In [39]:
# Same train/test rebuild again (repeated per metric).
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [40]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [41]:
# Grid-search row with the highest F1.
best_f1 = runs_df.iloc[runs_df['f1'].idxmax()]
In [42]:
# Best F1 matches the best-accuracy/precision model.
print(best_f1.equals(best_recall))
print(best_f1.equals(best_accuracy))
print(best_f1.equals(best_precision))
False
True
True

Same model as best accuracy and precision - scroll up.

Train on all NSRDB data, test various freq of ground data

In [43]:
# `best_precision` is a row view of runs_df, so writing into it triggered the
# SettingWithCopyWarning shown below; take an explicit copy before mutating.
# (read_csv loaded the None class_weight back as NaN, which sklearn rejects.)
best_precision = best_precision.copy()
best_precision['class_weight'] = None

# Final model: train on ALL NSRDB data with the best-precision hyperparameters,
# with n_estimators bumped to 100 for the final fit.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf = ensemble.RandomForestClassifier(**best_precision[['max_depth', 'class_weight', 'min_samples_leaf']].to_dict(), n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[43]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [44]:
# Feature importances of the final (all-NSRDB) model.
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [45]:
# Load ORNL ground measurements and localize to EST.
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('EST')
test = cs_detection.ClearskyDetection(ground.df)
In [46]:
# One month of ground data.
test.trim_dates('10-01-2008', '11-01-2008')
In [47]:
# Downsample to 30-minute cadence (keep only on-the-half-hour samples).
test.df = test.df[test.df.index.minute % 30 == 0]
In [48]:
test.df.keys()
Out[48]:
Index(['GHI', 'Clearsky GHI pvlib', 'sky_status pvlib', 'ghi_status', 'scale'], dtype='object')
In [49]:
# Ground data has no 'tfn' column yet; compute it as for NSRDB.
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [50]:
# Day-by-day prediction on 30-min ground data using the final model (window=3).
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

In [51]:
# Restrict NSRDB data to the ground-data timestamps so labels can be compared.
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [52]:
# Plot agreements/disagreements between ML predictions and NSRDB labels.
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [53]:
# Per-sample clear-sky probability from the RF, for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')
In [55]:
# NOTE(review): In [54] is missing -- execution counts are non-sequential here;
# confirm nothing was run out of order.
cm = metrics.confusion_matrix(train2.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

15 min freq ground data

ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-17-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')test.df = test.df[test.df.index.minute % 15 == 0] # test.df = test.df.resample('15T').apply(lambda x: x[len(x) // 2])pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 10 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 10 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', 
'10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 5 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-04-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 5 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 
'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 1 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 1 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')# Save modelimport picklewith open('8_abq_direction_features_model.pkl', 'wb') as f: pickle.dump(clf, f)!ls *abq*# ConclusionIn general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB labeled points. I suspect this could be further improved my doing a larger hyperparameter search, or even doing some feature extraction/reduction/additions.
In [ ]: